library(DESeq2)
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.3.2
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.3.2
## Loading required package: SummarizedExperiment
## Warning: package 'SummarizedExperiment' was built under R version 4.3.2
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(ggplot2)
library(genefilter)
##
## Attaching package: 'genefilter'
## The following objects are masked from 'package:MatrixGenerics':
##
## rowSds, rowVars
## The following objects are masked from 'package:matrixStats':
##
## rowSds, rowVars
library(GenomicFeatures)
## Warning: package 'GenomicFeatures' was built under R version 4.3.2
## Loading required package: AnnotationDbi
library(biomaRt)
## Warning: package 'biomaRt' was built under R version 4.3.2
library(knitr)
library(reshape2)
library(scales)
library(Biostrings)
## Warning: package 'Biostrings' was built under R version 4.3.2
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(kableExtra)
library(pheatmap)
library(org.Mm.eg.db)
##
library(clusterProfiler)
## Warning: package 'clusterProfiler' was built under R version 4.3.2
##
## clusterProfiler v4.10.0 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:XVector':
##
## slice
## The following object is masked from 'package:biomaRt':
##
## select
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
library("RColorBrewer")
#read in the ensembl.genes
mm.gtf.db <- makeTxDbFromGFF("annotation/Mus_musculus.GRCm38.102.chr.gtf", format="gtf" )
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
## stop_codon. This information was ignored.
## OK
ensembl.genes = genes(mm.gtf.db)
mouse = useEnsembl(biomart="ENSEMBL_MART_ENSEMBL", dataset="mmusculus_gene_ensembl", version = "102")
bm.annotations = getBM(attributes=c("ensembl_gene_id", "entrezgene_id", "gene_biotype", "mgi_symbol", "description", "external_gene_name"), mart=mouse, filters="ensembl_gene_id", values=ensembl.genes$gene_id, uniqueRows=TRUE)
ensembl.genes$external_gene_name = bm.annotations$external_gene_name[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]
ensembl.genes$gene_biotype = bm.annotations$gene_biotype[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]
ensembl.genes$mgi_symbol = bm.annotations$mgi_symbol[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]
ensembl.genes$description = bm.annotations$description[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]
ensembl.genes$entrezgene_id = bm.annotations$entrezgene_id[ match(ensembl.genes$gene_id, bm.annotations$ensembl_gene_id) ]
#read in the experimental metadata
experimental_metadata = read.delim("experimental_metadata.txt", sep="\t")
#deleting the row that has KPC cachectic replicate 2
#create matrix of data
data = matrix(0, ncol=length(experimental_metadata$name), nrow=55401)
colnames(data)= experimental_metadata$name
for( i in experimental_metadata$name){
data[,i] = read.table(paste("data/", i, ".genes.results",sep=""), sep="\t", header=TRUE)$expected_count
}
row.names(data) = read.table(paste("data/", i, ".genes.results",sep=""), sep="\t", header=TRUE)$gene_id
#Create a factor for the condition column (Cachectic vs. Control) - by making it a factor you give it an order
experimental_metadata$condition = factor(experimental_metadata$condition, levels=c("control", "cachectic"))
#Create a factor for the replicate column - by making it a factor you give it an order
experimental_metadata$replicate = factor(experimental_metadata$replicate, levels=c("1", "2"))
#Create a factor for the cell type column - by making it a factor you give it an order
experimental_metadata$cell_line = factor(experimental_metadata$cell_line, levels = c("KPP", "KPC", "C26", "LLC"))
#Preparing for DESeq - converting everything to be integers
data_mat = apply(round(data), c(1,2), as.integer)
data_mat = subset(data_mat, select= -8) #removing the 8th column that consists of KPC cachectic rep
experimental_metadata = experimental_metadata[-8,]
data_mat = data_mat[!(row.names(data_mat) %in% ensembl.genes$gene_id[ensembl.genes$gene_biotype %in% c("rRNA", "snoRNA", "snRNA", "Mt_rRNA")]),]
data_mat = data_mat[rowSums(data_mat) > 0,]
data_mat = data_mat[ !(row.names(data_mat) %in% ensembl.genes[seqnames(ensembl.genes) == "chrM"]$gene_id), ]
dds = DESeqDataSetFromMatrix(data_mat, experimental_metadata, ~ cell_line + condition)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
rld <- rlog(dds)
sampleDists <- dist(t(assay(rld)))
plot(hclust(sampleDists))
annot = dplyr::select(experimental_metadata, c(cell_line, condition))
row.names(annot) = experimental_metadata$name
rld %>%
assay() %>%
cor() %>%
pheatmap(color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
annotation = annot,
annotation_colors = list(
condition = c(control = "blue", cachectic= "red"),
cell_line = c(C26 = "#BF3100", KPC = "#E9B44C",
KPP = "#1B998B", LLC = "#5D576B")),
cluster_rows = TRUE,
cluster_cols = T,
cellwidth = 13,
cellheight = 13)
ntop = 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select,]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
pca_data <- plotPCA(rld, intgroup = c("cell_line", "condition"), returnData=TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=condition, shape=cell_line)) + geom_point(size=3) +
scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance"), limits=c(-20, 30)) +
scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() + theme_classic() + geom_text(data = pca_data, aes(PC1,PC2, label = name), hjust = 1.2)
PC1 is really messing up with C26 - represents differences in mouse strain, so there are definitely batch/technical artefacts in this dataset - lets use SVA to handle this
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:Biostrings':
##
## collapse
## The following object is masked from 'package:IRanges':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## Loading required package: BiocParallel
dat <- counts(dds, normalized= TRUE)
idx <- rowMeans(dat) > 1
dat <- dat[idx, ]
mod <- model.matrix(~ cell_line + condition, colData(dds))
mod0 <- model.matrix(~cell_line , colData(dds)) #
svseq <- svaseq(dat, mod, mod0, n.sv = 2) # allowing n.sv does not enable the linear model to converge
## Number of significant surrogate variables is: 2
## Iteration (out of 5 ):1 2 3 4 5
ddssva <- dds
ddssva$SV1 <- svseq$sv[,1]
ddssva$SV2 <- svseq$sv[,2]
design(ddssva) <- ~ SV1 + SV2 + cell_line + condition
rld <- rlog(ddssva)
Plotting PCA of SV1 - to analyse if this is the hidden effect we are looking for
(pca_data <- plotPCA(rld, intgroup = c("SV1", "SV2"), returnData=TRUE))
## using ntop=500 top features by variance
## PC1 PC2
## KPP_control_rep1 -8.5659692 4.668301
## KPP_control_rep2 -8.5874817 4.199114
## KPP_cachectic_rep1 -4.1120988 -10.507347
## KPP_cachectic_rep2 -6.0756795 -8.722633
## KPC_control_rep1 -8.2169924 6.852097
## KPC_control_rep2 -8.7191768 6.346721
## KPC_cachectic_rep1 -0.8345102 -10.301588
## C26_control_rep1 13.2829266 14.087862
## C26_control_rep2 14.0758737 13.876702
## C26_cachectic_rep1 21.3412882 -4.676311
## C26_cachectic_rep2 23.0152655 -6.570683
## LLC_control_rep1 -9.3395282 6.940298
## LLC_control_rep2 -9.1146540 6.301143
## LLC_cachectic_rep1 -1.9456672 -15.460528
## LLC_cachectic_rep2 -6.2035960 -7.033147
## group SV1
## KPP_control_rep1 0.094108912016152:-0.0382906543645605 0.094108912
## KPP_control_rep2 0.133402315644739:-0.117630544175785 0.133402316
## KPP_cachectic_rep1 0.21668690853748:0.146729360140979 0.216686909
## KPP_cachectic_rep2 0.355302897129738:0.306700418492008 0.355302897
## KPC_control_rep1 0.108359916290531:-0.114565027644468 0.108359916
## KPC_control_rep2 0.167178845254626:-0.0380267107240203 0.167178845
## KPC_cachectic_rep1 0.00995550326925079:0.0467208833882533 0.009955503
## C26_control_rep1 -0.230124798302928:0.0649040045876099 -0.230124798
## C26_control_rep2 -0.282961574795913:0.229989638111462 -0.282961575
## C26_cachectic_rep1 -0.589434990498833:0.275173714211235 -0.589434990
## C26_cachectic_rep2 -0.458284126490093:-0.306645561503745 -0.458284126
## LLC_control_rep1 0.153209746621263:0.0838660206608385 0.153209747
## LLC_control_rep2 0.0864716410217112:0.217256100722409 0.086471641
## LLC_cachectic_rep1 0.0437252518953962:-0.755813811032454 0.043725252
## LLC_cachectic_rep2 0.192403552406881:-0.000367830869761334 0.192403552
## SV2 name
## KPP_control_rep1 -0.0382906544 KPP_control_rep1
## KPP_control_rep2 -0.1176305442 KPP_control_rep2
## KPP_cachectic_rep1 0.1467293601 KPP_cachectic_rep1
## KPP_cachectic_rep2 0.3067004185 KPP_cachectic_rep2
## KPC_control_rep1 -0.1145650276 KPC_control_rep1
## KPC_control_rep2 -0.0380267107 KPC_control_rep2
## KPC_cachectic_rep1 0.0467208834 KPC_cachectic_rep1
## C26_control_rep1 0.0649040046 C26_control_rep1
## C26_control_rep2 0.2299896381 C26_control_rep2
## C26_cachectic_rep1 0.2751737142 C26_cachectic_rep1
## C26_cachectic_rep2 -0.3066455615 C26_cachectic_rep2
## LLC_control_rep1 0.0838660207 LLC_control_rep1
## LLC_control_rep2 0.2172561007 LLC_control_rep2
## LLC_cachectic_rep1 -0.7558138110 LLC_cachectic_rep1
## LLC_cachectic_rep2 -0.0003678309 LLC_cachectic_rep2
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=SV1)) + geom_point(size=3) +
scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance"), limits = c(-15, 30)) +
scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() + ggtitle("PCA with SV1 highlighted")+ theme_classic() + geom_text(size= 3, data = pca_data, aes(PC1,PC2, label = name), hjust = 0.6)
boxplot(rld$SV1 ~ rld$cell_line, xlab="Cell line", ylab="SV1", ylim=c(-0.6, 0.4))
stripchart(rld$SV1 ~rld$cell_line,
method = "jitter",
pch = 19,
col = 4,
vertical = TRUE,
add = TRUE)
(pca_data <- plotPCA(rld, intgroup = c("SV1", "SV2"), returnData=TRUE))
## using ntop=500 top features by variance
## PC1 PC2
## KPP_control_rep1 -8.5659692 4.668301
## KPP_control_rep2 -8.5874817 4.199114
## KPP_cachectic_rep1 -4.1120988 -10.507347
## KPP_cachectic_rep2 -6.0756795 -8.722633
## KPC_control_rep1 -8.2169924 6.852097
## KPC_control_rep2 -8.7191768 6.346721
## KPC_cachectic_rep1 -0.8345102 -10.301588
## C26_control_rep1 13.2829266 14.087862
## C26_control_rep2 14.0758737 13.876702
## C26_cachectic_rep1 21.3412882 -4.676311
## C26_cachectic_rep2 23.0152655 -6.570683
## LLC_control_rep1 -9.3395282 6.940298
## LLC_control_rep2 -9.1146540 6.301143
## LLC_cachectic_rep1 -1.9456672 -15.460528
## LLC_cachectic_rep2 -6.2035960 -7.033147
## group SV1
## KPP_control_rep1 0.094108912016152:-0.0382906543645605 0.094108912
## KPP_control_rep2 0.133402315644739:-0.117630544175785 0.133402316
## KPP_cachectic_rep1 0.21668690853748:0.146729360140979 0.216686909
## KPP_cachectic_rep2 0.355302897129738:0.306700418492008 0.355302897
## KPC_control_rep1 0.108359916290531:-0.114565027644468 0.108359916
## KPC_control_rep2 0.167178845254626:-0.0380267107240203 0.167178845
## KPC_cachectic_rep1 0.00995550326925079:0.0467208833882533 0.009955503
## C26_control_rep1 -0.230124798302928:0.0649040045876099 -0.230124798
## C26_control_rep2 -0.282961574795913:0.229989638111462 -0.282961575
## C26_cachectic_rep1 -0.589434990498833:0.275173714211235 -0.589434990
## C26_cachectic_rep2 -0.458284126490093:-0.306645561503745 -0.458284126
## LLC_control_rep1 0.153209746621263:0.0838660206608385 0.153209747
## LLC_control_rep2 0.0864716410217112:0.217256100722409 0.086471641
## LLC_cachectic_rep1 0.0437252518953962:-0.755813811032454 0.043725252
## LLC_cachectic_rep2 0.192403552406881:-0.000367830869761334 0.192403552
## SV2 name
## KPP_control_rep1 -0.0382906544 KPP_control_rep1
## KPP_control_rep2 -0.1176305442 KPP_control_rep2
## KPP_cachectic_rep1 0.1467293601 KPP_cachectic_rep1
## KPP_cachectic_rep2 0.3067004185 KPP_cachectic_rep2
## KPC_control_rep1 -0.1145650276 KPC_control_rep1
## KPC_control_rep2 -0.0380267107 KPC_control_rep2
## KPC_cachectic_rep1 0.0467208834 KPC_cachectic_rep1
## C26_control_rep1 0.0649040046 C26_control_rep1
## C26_control_rep2 0.2299896381 C26_control_rep2
## C26_cachectic_rep1 0.2751737142 C26_cachectic_rep1
## C26_cachectic_rep2 -0.3066455615 C26_cachectic_rep2
## LLC_control_rep1 0.0838660207 LLC_control_rep1
## LLC_control_rep2 0.2172561007 LLC_control_rep2
## LLC_cachectic_rep1 -0.7558138110 LLC_cachectic_rep1
## LLC_cachectic_rep2 -0.0003678309 LLC_cachectic_rep2
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=SV2)) + geom_point(size=3) +
scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance")) +
scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() + ggtitle("PCA for whole dataset")+ theme_classic() + geom_text(size= 3, data = pca_data, aes(PC1,PC2, label = name), hjust = 0.6)
boxplot(rld$SV2 ~ rld$cell_line, xlab="Cell line", ylab="SV2", ylim=c(-0.8, 0.4))
stripchart(rld$SV2 ~rld$cell_line,
method = "jitter",
pch = 19,
col = 4,
vertical = TRUE,
add = TRUE)
##PCA after removingBatchEffect
assay(rld) <- limma::removeBatchEffect(assay(rld), covariates=rld$SV1)
assay(rld) <- limma::removeBatchEffect(assay(rld), covariates = rld$SV2)
set.seed(1)
(pca_data <- plotPCA(rld, intgroup = c("condition", "cell_line"), returnData=TRUE))
## using ntop=500 top features by variance
## PC1 PC2 group condition cell_line
## KPP_control_rep1 -6.566336 1.5310023 control:KPP control KPP
## KPP_control_rep2 -6.687889 2.1484997 control:KPP control KPP
## KPP_cachectic_rep1 13.974273 6.3902714 cachectic:KPP cachectic KPP
## KPP_cachectic_rep2 13.417139 5.2760307 cachectic:KPP cachectic KPP
## KPC_control_rep1 -9.799699 -3.7412844 control:KPC control KPC
## KPC_control_rep2 -8.575491 -6.7337349 control:KPC control KPC
## KPC_cachectic_rep1 10.850402 -13.4364170 cachectic:KPC cachectic KPC
## C26_control_rep1 -11.244394 4.6297399 control:C26 control C26
## C26_control_rep2 -9.381332 3.4055672 control:C26 control C26
## C26_cachectic_rep1 9.660820 -2.1739595 cachectic:C26 cachectic C26
## C26_cachectic_rep2 6.739228 2.2287239 cachectic:C26 cachectic C26
## LLC_control_rep1 -8.378481 0.2811608 control:LLC control LLC
## LLC_control_rep2 -6.629013 -0.5519275 control:LLC control LLC
## LLC_cachectic_rep1 6.243037 2.6565163 cachectic:LLC cachectic LLC
## LLC_cachectic_rep2 6.377735 -1.9101889 cachectic:LLC cachectic LLC
## name
## KPP_control_rep1 KPP_control_rep1
## KPP_control_rep2 KPP_control_rep2
## KPP_cachectic_rep1 KPP_cachectic_rep1
## KPP_cachectic_rep2 KPP_cachectic_rep2
## KPC_control_rep1 KPC_control_rep1
## KPC_control_rep2 KPC_control_rep2
## KPC_cachectic_rep1 KPC_cachectic_rep1
## C26_control_rep1 C26_control_rep1
## C26_control_rep2 C26_control_rep2
## C26_cachectic_rep1 C26_cachectic_rep1
## C26_cachectic_rep2 C26_cachectic_rep2
## LLC_control_rep1 LLC_control_rep1
## LLC_control_rep2 LLC_control_rep2
## LLC_cachectic_rep1 LLC_cachectic_rep1
## LLC_cachectic_rep2 LLC_cachectic_rep2
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=condition, shape=cell_line)) + geom_point(size=3) +
scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance"), limits = c(-20,20)) +
scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance"), limits = c(-15, 10)) +
ggtitle("PCA after correction")+ theme_classic() + geom_text(size= 3, data = pca_data, aes(PC1,PC2, label = name), hjust = 0.2)
sampleDists <- dist(t(assay(rld)))
plot(hclust(sampleDists))
#filter
filter = apply(counts(ddssva, normalized=TRUE), 1, function(x){ mean(x) >= 1 })
ddssva = ddssva[filter, ]
ddssva <- estimateSizeFactors(ddssva)
ddssva <- estimateDispersions(ddssva)
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
ddssva = nbinomLRT(ddssva, full= ~1 + cell_line + SV1 + SV2 + condition, reduced = ~ 1 + cell_line + SV1 + SV2)
results.lrt = results(ddssva)
cachectic_vs_control = results(ddssva, contrast=c("condition", "cachectic", "control"), independentFiltering = TRUE, alpha=0.1)
cachectic_vs_control <- lfcShrink(ddssva,
coef = "condition_cachectic_vs_control", res=cachectic_vs_control, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
plotMA(cachectic_vs_control, colSig="red")
Plotting volcano plots to visualise the differential expression
#volcano plots for differences between control and cachectic
par(mar=c(5,5,5,5), cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)
# Adjusted P values (FDR Q values)
with(cachectic_vs_control, plot(log2FoldChange, -log10(padj), pch=20, main="Volcano plot of control vs. cachectic", cex=1.0, xlab=bquote(~Log[2]~fold~change), ylab=bquote(~-log[10]~padj~value) ) )
with(subset(cachectic_vs_control, padj<0.1 & log2FoldChange> 1), points(log2FoldChange, -log10(padj), pch=20, col="red", cex=0.5))
with(subset(cachectic_vs_control, padj<0.1 & log2FoldChange< -1), points(log2FoldChange, -log10(padj), pch=20, col="blue", cex=0.5))
#Add lines for absolute FC>2 and P-value cut-off at FDR Q<0.1
abline(v=0, col="black", lty=3, lwd=1.0)
abline(v=-1, col="black", lty=4, lwd=2.0)
abline(v=1, col="black", lty=4, lwd=2.0)
abline(h=-log10(0.1), col="black", lty=4, lwd=2.0)
legend("right", legend=c("Upregulated", "Downregulated", "Not"),fill= c("red", "blue", "black"))
So there are only two experimental conditions we care about in this analysis, however it would be good to visualise expression across the cell line, so lets plot a heatmap with two clusters (those upregulated and those downregulated)
rld <- rlog(ddssva)
assay(rld) <- limma::removeBatchEffect(assay(rld), covariates=rld$SV1)
assay(rld) <- limma::removeBatchEffect(assay(rld), covariates = rld$SV2)
significant_results=cachectic_vs_control[!is.na(cachectic_vs_control$padj) & cachectic_vs_control$padj<0.1,] ##10% are false positives.
rld_signif = assay(rld)[rownames(significant_results),]
#how many genes are significantly different?
nrow(rld_signif) #6465
## [1] 6465
rld_z = t(apply(rld_signif, 1, function(x){ (x - mean(x)) / sd(x)}))
thr = 3 ##threshold of 3 sd away
rld_z[rld_z > thr] = thr ## setting rld_z values > 3 as 3
rld_z[rld_z < -thr] = -thr ## setting rld_z values <-3 as -3
paletteLength = 20 ##making a pheatmap
breaksList <- c(seq(-thr, 0, length.out=ceiling(paletteLength/2) + 1),
seq(thr/paletteLength, thr, length.out=floor(paletteLength/2)))
# sort out colour scheme
color = c(colorRampPalette(c("mediumblue", "white"))(14), colorRampPalette(c("white", "firebrick2"))(14))
## from blue-->white--> brick red
breaksList = seq(-3, 3, length.out = 29)
paletteLength = 20 ##making a pheatmap
breaksList <- c(seq(-thr, 0, length.out=ceiling(paletteLength/2) + 1),
seq(thr/paletteLength, thr, length.out=floor(paletteLength/2)))
color = c(colorRampPalette(c("mediumblue", "white"))(14), colorRampPalette(c("white", "firebrick2"))(14))
## from blue-->white--> brick red
breaksList = seq(-3, 3, length.out = 29)
set.seed(2)
nclust = 2
results.coef.kmeans = kmeans(rld_z, nclust, nstart=100, iter.max=50)
results.coef = rld_z[order(results.coef.kmeans$cluster, decreasing=TRUE),]
indicator = results.coef.kmeans$cluster[order(results.coef.kmeans$cluster)] ## only want the cluster data
table(indicator)
## indicator
## 1 2
## 3249 3216
heat.map <- pheatmap(results.coef, cluster_col=TRUE, breaks=breaksList, cluster_rows=FALSE, show_rownames=FALSE,color = color,fontsize_row = 3, legend=TRUE,border_color = NA, )
cachectic_vs_control.df = as.data.frame(cachectic_vs_control)
cachectic_vs_control.df$mgi_symbol = bm.annotations$mgi_symbol[ match(row.names(cachectic_vs_control.df), bm.annotations$ensembl_gene_id) ]
cachectic_vs_control.df$gene_biotype = bm.annotations$gene_biotype[ match(row.names(cachectic_vs_control.df), bm.annotations$ensembl_gene_id) ]
cachectic_vs_control.df$description = bm.annotations$description[ match(row.names(cachectic_vs_control.df), bm.annotations$ensembl_gene_id) ]
cachectic_vs_control.df$entrezgene_id = bm.annotations$entrezgene_id[ match(row.names(cachectic_vs_control.df), bm.annotations$ensembl_gene_id) ]
cachectic_vs_control.df$cluster = NA
cachectic_vs_control.df[names(results.coef.kmeans$cluster),]$cluster = ifelse(results.coef.kmeans$cluster==2, "UP", "DOWN")
write.csv(cachectic_vs_control.df, "./results/talbert_mouse_df.csv")
Annotating the clusters with the terms/processes/pathways that they are enriched with. Do these annotations make sense given the patterns in the clusters? Remember that clustering ordering changes between runs so make sure to remember to saveRDS and readRDS.
c1 = row.names(cachectic_vs_control.df[!is.na(cachectic_vs_control.df$cluster) & cachectic_vs_control.df$cluster=="UP",])
heat.map.c1 <- pheatmap(results.coef[c1,], cluster_col=TRUE, breaks=breaksList, cluster_rows=FALSE, show_rownames=FALSE,color = color,fontsize_row = 3, legend=TRUE,border_color = NA, main = "Cluster 1")
ego.BP_c1 <- enrichGO(gene = c1,
universe = rownames(dds),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 1,
readable = TRUE)
write.csv(as.data.frame(ego.BP_c1), "./results/go_bp_up.csv")
saveRDS(ego.BP_c1, "rds/ego.BP_up.rds")
dotplot(ego.BP_c1, title="GO:BP Cluster 1")
ego.MF_c1 <- enrichGO(gene = c1,
universe = rownames(dds),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 1,
readable = TRUE)
write.csv(as.data.frame(ego.MF_c1), "./results/go_mf_up.csv")
saveRDS(ego.MF_c1, "rds/ego.MF_up.rds")
dotplot(ego.MF_c1, title="GO:MF Cluster 1")
ego.CC_c1 <- enrichGO(gene = c1,
universe = rownames(dds),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 1,
readable = TRUE)
write.csv(as.data.frame(ego.CC_c1), "./results/go_cc_up.csv")
saveRDS(ego.CC_c1, "rds/ego.CC_up.rds")
dotplot(ego.CC_c1, title="GO:CC Cluster 1")
kegg_c1 <- enrichKEGG( gene= as.character(ensembl.genes[c1,]$entrezgene_id),
universe = as.character(ensembl.genes[rownames(dds),]$entrezgene_id),
organism = 'mmu' ,
pvalueCutoff = 1,
qvalueCutoff = 1)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/mmu/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/mmu"...
write.csv(as.data.frame(kegg_c1), "./results/kegg_up.csv")
saveRDS(kegg_c1, "rds/KEGG_up.rds")
dotplot(kegg_c1, title= "KEGG Cluster 1")
c2 = row.names(cachectic_vs_control.df[!is.na(cachectic_vs_control.df$cluster) & cachectic_vs_control.df$cluster=="DOWN",])
heat.map.c2 <- pheatmap(results.coef[c2,], cluster_col=TRUE, breaks=breaksList, cluster_rows=FALSE, show_rownames=FALSE,color = color,fontsize_row = 3, legend=TRUE,border_color = NA, main = "Cluster 2")
ego.BP_c2 <- enrichGO(gene = c2,
universe = rownames(dds),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 1,
readable = TRUE)
write.csv(as.data.frame(ego.BP_c2), "./results/go_bp_dn.csv")
saveRDS(ego.BP_c2, "rds/ego.BP_dn.rds")
dotplot(ego.BP_c2, title="GO:BP Cluster 2")
ego.MF_c2 <- enrichGO(gene = c2,
universe = rownames(dds),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 1,
readable = TRUE)
write.csv(as.data.frame(ego.MF_c2), "./results/go_mf_dn.csv")
saveRDS(ego.MF_c2, "rds/ego.MF_dn.rds")
dotplot(ego.MF_c2, title="GO:MF Cluster 2")
ego.CC_c2 <- enrichGO(gene = c2,
universe = rownames(dds),
OrgDb = org.Mm.eg.db,
keyType = 'ENSEMBL',
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 1,
readable = TRUE)
saveRDS(ego.CC_c2, "rds/ego.CC_dn.rds")
write.csv(as.data.frame(ego.CC_c2), "./results/go_cc_dn.csv")
dotplot(ego.CC_c2, title="GO:CC Cluster 2")
kegg_c2 <- enrichKEGG( gene= as.character(ensembl.genes[c2,]$entrezgene_id),
universe = as.character(ensembl.genes[rownames(dds),]$entrezgene_id),
organism = 'mmu' ,
pvalueCutoff=1,
qvalueCutoff =1)
write.csv(as.data.frame(kegg_c2), "./results/kegg_dn.csv")
saveRDS(kegg_c2, "rds/KEGG_dn.rds")
dotplot(kegg_c2, title= "KEGG Cluster 2")
test <- simplify(ego.BP_c1, cutoff=0.7, by="qvalue", select_fun= min, measure="Wang")
library(stringr)
up.terms = rbind(
ego.BP_c1[ego.BP_c1$Description %in% c("mRNA processing", "ribosome biogenesis", "regulation of apoptotic signaling pathway", "muscle cell proliferation"),
c("ID", "Description", "GeneRatio", "BgRatio", "pvalue", "p.adjust", "qvalue")],
ego.MF_c1[ego.MF_c1$Description %in% c("RNA polymerase II-specific DNA-binding transcription factor binding", "structural constituent of ribosome"),
c("ID", "Description", "GeneRatio", "BgRatio", "pvalue", "p.adjust", "qvalue")],
kegg_c1[kegg_c1$Description %in% c("HIF-1 signaling pathway - Mus musculus (house mouse)", "Mitophagy - animal - Mus musculus (house mouse)", "Notch signaling pathway - Mus musculus (house mouse)"),
c("ID", "Description", "GeneRatio", "BgRatio", "pvalue", "p.adjust", "qvalue")]
)
up.terms$ontology = c(rep("GO:BP", 4), rep("GO:MF", 2), rep("KEGG", 3))
up.terms$count = sapply(str_split(up.terms$GeneRatio, "/"), function(x){ as.numeric(x[1])})
up.terms$gr = sapply(str_split(up.terms$GeneRatio, "/"), function(x){ as.numeric(x[1])/as.numeric(x[2])})
up.terms$bgr = sapply(str_split(up.terms$BgRatio, "/"), function(x){ as.numeric(x[1])/as.numeric(x[2])})
up.terms$bg = sapply(str_split(up.terms$BgRatio, "/"), function(x){ as.numeric(x[1])})
up.terms$gb = up.terms$count / up.terms$bg
up.terms$category = "UP"
down.terms = rbind(
ego.BP_c2[ego.BP_c2$Description %in% c("muscle cell differentiation", "collagen fibril organization", "tricarboxylic acid cycle", "response to oxygen levels"),
c("ID", "Description", "GeneRatio", "pvalue", "p.adjust", "qvalue", "BgRatio")],
ego.CC_c2[ego.CC_c2$Description %in% c("myofibril", "respirasome"),
c("ID", "Description", "BgRatio", "GeneRatio", "pvalue", "p.adjust", "qvalue")],
kegg_c2[kegg_c2$Description %in% c("Oxidative phosphorylation - Mus musculus (house mouse)", "Cardiac muscle contraction - Mus musculus (house mouse)", "Focal adhesion - Mus musculus (house mouse)"),
c("ID", "Description", "GeneRatio", "BgRatio", "pvalue", "p.adjust", "qvalue")]
)
down.terms$ontology = c(rep("GO:BP", 4), rep("GO:CC", 2), rep("KEGG", 3))
down.terms$count = sapply(str_split(down.terms$GeneRatio, "/"), function(x){ as.numeric(x[1])})
down.terms$gr = sapply(str_split(down.terms$GeneRatio, "/"), function(x){ as.numeric(x[1])/as.numeric(x[2])})
down.terms$bgr = sapply(str_split(down.terms$BgRatio, "/"), function(x){ as.numeric(x[1])/as.numeric(x[2])})
down.terms$bg = sapply(str_split(down.terms$BgRatio, "/"), function(x){ as.numeric(x[1])})
down.terms$gb = down.terms$count / down.terms$bg
down.terms$category = "DOWN"
terms.to.plot = rbind(up.terms, down.terms)
terms.to.plot$Description = factor(terms.to.plot$Description,
rev(c("mRNA processing",
"ribosome biogenesis",
"regulation of apoptotic signaling pathway",
"muscle cell proliferation",
"RNA polymerase II-specific DNA-binding transcription factor binding",
"structural constituent of ribosome",
"HIF-1 signaling pathway - Mus musculus (house mouse)",
"Mitophagy - animal - Mus musculus (house mouse)",
"Notch signaling pathway - Mus musculus (house mouse)",
"muscle cell differentiation",
"collagen fibril organization",
"tricarboxylic acid cycle",
"response to oxygen levels",
"myofibril", "respirasome",
"Oxidative phosphorylation - Mus musculus (house mouse)",
"Cardiac muscle contraction - Mus musculus (house mouse)",
"Focal adhesion - Mus musculus (house mouse)")))
ggplot(terms.to.plot, aes(x=gb, y=Description, size=count, colour=qvalue)) + geom_point() + scale_x_continuous("Fraction of annotated genes", limits=c(0, 1)) + theme_bw() + scale_color_continuous(limits=c(0, 0.05), high="blue", low="red") + facet_grid(~category)
ggplot(terms.to.plot, aes(x=gr, y=Description, size=count, colour=qvalue)) + geom_point() + scale_x_continuous("Gene Ratio", limits=c(0, 0.1)) + theme_bw() + scale_color_continuous(limits=c(0, 0.05), high="blue", low="red") + facet_grid(~category)
genes_plot <- data.frame(
model = ddssva$cell_line,
condition = ddssva$condition,
expression = counts(ddssva, normalized = TRUE)[c("ENSMUSG00000005583"), ])
genes_plot %>%
ggplot(aes(x = condition, y = expression, group = condition, fill=condition)) +
geom_boxplot() +
#geom_text(aes(label=round(value,2)), vjust=-0.3, size=3.5) +
xlab("Samples") +
ylab("Normalised Counts") +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
geom_point(data=genes_plot, aes(x=condition, y=expression, group=condition), size=2) + scale_y_continuous(limits=c(5000, 25000)) + facet_grid(~model)
saveRDS(ddssva, "rds/ddssva.rds")
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/London
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] stringr_1.5.1 sva_3.50.0
## [3] BiocParallel_1.36.0 mgcv_1.9-1
## [5] nlme_3.1-164 RColorBrewer_1.1-3
## [7] clusterProfiler_4.10.0 org.Mm.eg.db_3.18.0
## [9] pheatmap_1.0.12 kableExtra_1.4.0
## [11] Biostrings_2.70.2 XVector_0.42.0
## [13] scales_1.3.0 reshape2_1.4.4
## [15] knitr_1.45 biomaRt_2.58.2
## [17] GenomicFeatures_1.54.3 AnnotationDbi_1.64.1
## [19] genefilter_1.84.0 ggplot2_3.5.0
## [21] DESeq2_1.42.0 SummarizedExperiment_1.32.0
## [23] Biobase_2.62.0 MatrixGenerics_1.14.0
## [25] matrixStats_1.2.0 GenomicRanges_1.54.1
## [27] GenomeInfoDb_1.38.6 IRanges_2.36.0
## [29] S4Vectors_0.40.2 BiocGenerics_0.48.1
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.1 BiocIO_1.12.0 bitops_1.0-7
## [4] ggplotify_0.1.2 filelock_1.0.3 tibble_3.2.1
## [7] polyclip_1.10-6 XML_3.99-0.16.1 lifecycle_1.0.4
## [10] mixsqp_0.3-54 edgeR_4.0.16 lattice_0.22-5
## [13] MASS_7.3-60.0.1 magrittr_2.0.3 limma_3.58.1
## [16] sass_0.4.8 rmarkdown_2.25 jquerylib_0.1.4
## [19] yaml_2.3.8 cowplot_1.1.3 DBI_1.2.2
## [22] abind_1.4-5 zlibbioc_1.48.0 purrr_1.0.2
## [25] ggraph_2.2.0 RCurl_1.98-1.14 yulab.utils_0.1.4
## [28] tweenr_2.0.3 rappdirs_0.3.3 GenomeInfoDbData_1.2.11
## [31] enrichplot_1.22.0 ggrepel_0.9.5 irlba_2.3.5.1
## [34] tidytree_0.4.6 annotate_1.80.0 svglite_2.1.3
## [37] codetools_0.2-19 DelayedArray_0.28.0 DOSE_3.28.2
## [40] xml2_1.3.6 ggforce_0.4.2 tidyselect_1.2.1
## [43] aplot_0.2.2 farver_2.1.1 viridis_0.6.5
## [46] BiocFileCache_2.10.1 GenomicAlignments_1.38.2 jsonlite_1.8.8
## [49] tidygraph_1.3.1 survival_3.5-8 systemfonts_1.0.5
## [52] tools_4.3.1 progress_1.2.3 treeio_1.26.0
## [55] Rcpp_1.0.12 glue_1.7.0 gridExtra_2.3
## [58] SparseArray_1.2.4 xfun_0.42 qvalue_2.34.0
## [61] dplyr_1.1.4 withr_3.0.0 fastmap_1.1.1
## [64] fansi_1.0.6 truncnorm_1.0-9 digest_0.6.35
## [67] R6_2.5.1 gridGraphics_0.5-1 colorspace_2.1-0
## [70] GO.db_3.18.0 RSQLite_2.3.5 utf8_1.2.4
## [73] tidyr_1.3.1 generics_0.1.3 data.table_1.15.0
## [76] rtracklayer_1.62.0 prettyunits_1.2.0 graphlayouts_1.1.0
## [79] httr_1.4.7 S4Arrays_1.2.0 scatterpie_0.2.1
## [82] pkgconfig_2.0.3 gtable_0.3.4 blob_1.2.4
## [85] shadowtext_0.1.3 htmltools_0.5.7 fgsea_1.28.0
## [88] png_0.1-8 ashr_2.2-63 ggfun_0.1.4
## [91] rstudioapi_0.15.0 rjson_0.2.21 curl_5.2.0
## [94] cachem_1.0.8 parallel_4.3.1 HDO.db_0.99.1
## [97] restfulr_0.0.15 pillar_1.9.0 grid_4.3.1
## [100] vctrs_0.6.5 dbplyr_2.4.0 xtable_1.8-4
## [103] evaluate_0.23 invgamma_1.1 cli_3.6.2
## [106] locfit_1.5-9.8 compiler_4.3.1 Rsamtools_2.18.0
## [109] rlang_1.1.3 crayon_1.5.2 SQUAREM_2021.1
## [112] labeling_0.4.3 plyr_1.8.9 fs_1.6.3
## [115] stringi_1.8.3 viridisLite_0.4.2 munsell_0.5.1
## [118] lazyeval_0.2.2 GOSemSim_2.28.1 Matrix_1.6-5
## [121] hms_1.1.3 patchwork_1.2.0 bit64_4.0.5
## [124] statmod_1.5.0 KEGGREST_1.42.0 highr_0.10
## [127] igraph_2.0.2 memoise_2.0.1 bslib_0.6.1
## [130] ggtree_3.10.1 fastmatch_1.1-4 bit_4.0.5
## [133] ape_5.7-1 gson_0.1.0